read_tsv("./../data/_raw/raw.tsv",
show_col_types = FALSE) |>
write_tsv("../data/01_dat_load.tsv")00_all.qmd
01_load
Load data
02_clean
Load Data
full_data <- read_tsv("./../data/01_dat_load.tsv",
show_col_types = FALSE)Split Data
We have extracted one column (sample_tags) that contains comma-separated values into its own dataframe. This allows us to perform separate cleaning operations on it while showcasing our proficiency in joining dataframes.
patient_data <- full_data |>
select(sample_name, sample_tags)
meta_data <- full_data |>
select(!sample_tags)Clean Data
patient_data <- patient_data |>
mutate(
case_control = case_when(
str_detect(string = sample_tags,
pattern = "Case") ~ "Case",
str_detect(string = sample_tags,
pattern = "Control") ~ "Control",
TRUE ~ NA),
id = case_when(
str_detect(string = sample_tags,
pattern = "Subject") ~ as.double(
str_extract(string = sample_tags,
pattern = "(?<=Subject[^,])\\d+")),
str_detect(string = sample_tags,
pattern = "Control") ~ as.double(
str_extract(string = sample_tags,
pattern = "(?<=Control[^,])\\d+")),
TRUE ~ NA),
timepoint = as.double(
str_extract(string = sample_tags,
pattern = "(?<=Timepoint)\\s+\\d+[.\\d+]?")),
gender = case_when(str_detect(string = sample_tags,
pattern = "Female") ~ "Female",
str_detect(string = sample_tags,
pattern = "Male") ~ "Male"),
ethnicity = case_when(str_detect(string = sample_tags,
pattern = "Non-Hispanic") ~ "Non-Hispanic",
str_detect(string = sample_tags,
pattern = "Hispanic") ~ "Hispanic",
TRUE ~ as.character(sample_tags)),
race = case_when(str_detect(string = sample_tags,
pattern = "Caucasian") ~ "Caucasian",
str_detect(string = sample_tags,
pattern = "Asian") ~ "Asian",
TRUE ~ as.character(sample_tags)),
age = round(if_else(
is.na(as.double(str_extract(string = sample_tags,
pattern = "\\d+(?=\\s+(Years)\\,)"))),
(1/12)*as.double(str_extract(string = sample_tags,
pattern = "\\d+(?=\\s+(Months)\\,)")),
as.double(str_extract(string = sample_tags,
pattern = "\\d+(?=\\s+(Years)\\,)"))),
digits = 2),
age_diagnosis = as.double(
str_extract(string = sample_tags,
pattern = "\\d*\\.\\d+(?=\\sYears\\sat\\sdiagnosis)")),
age_visit = as.double(
str_extract(string = sample_tags,
pattern = "\\d+(?=\\sYears\\sat\\svisit)")),
GAD65 = as.double(
str_extract(string = sample_tags,
pattern = "(?<=GAD65\\s)\\d+")),
IA_2 = as.double(
str_extract(string = sample_tags,
pattern = "(?<=IA\\-2\\s)\\d+")),
IAA = as.double(
str_extract(string = sample_tags,
pattern = "(?<=IAA )[-+]?[0-9]*\\.?[0-9]+")),
ZnT8 = as.double(
str_extract(string = sample_tags,
pattern = "(?<=ZnT8\\s)\\d+\\.\\d+"))
) |>
mutate(
HLA_A = str_extract_all(string = sample_tags,
pattern = "(?<=HLA\\-{1}A\\*{1})\\d{4}"),
HLA_B = str_extract_all(string = sample_tags,
pattern = "(?<=HLA\\-{1}B\\*{1})\\d{4}"),
HLA_C = str_extract_all(string = sample_tags,
pattern = "(?<=HLA\\-{1}C\\*{1})\\d{4}"),
HLA_DPA1 = str_extract_all(string = sample_tags,
pattern = "(?<=HLA\\-{1}DPA1\\*{1})\\d{4}"),
HLA_DPB1 = str_extract_all(string = sample_tags,
pattern = "(?<=HLA\\-{1}DPB1\\*{1})\\d{4}"),
HLA_DQA1 = str_extract_all(string = sample_tags,
pattern = "(?<=HLA\\-{1}DQA1\\*{1})\\d{4}"),
HLA_DQB1 = str_extract_all(string = sample_tags,
pattern = "(?<=HLA\\-{1}DQB1\\*{1})\\d{4}"),
HLA_DRB1 = str_extract_all(string = sample_tags,
pattern = "(?<=HLA\\-{1}A\\*{1})\\d{4}")
) |>
unnest_wider(col = starts_with("HLA_"),
names_sep = ",") |>
select(!sample_tags)Join Data
full_data <- meta_data |>
inner_join(patient_data,
by = "sample_name")Write Data
full_data |>
write_tsv("./../data/02_dat_clean.tsv")
03_augment
Select data
read_tsv("./../data/02_dat_clean.tsv",
show_col_types = FALSE) |>
filter(!str_detect(string = sample_name,
pattern = "Denver")) |>
arrange(case_control, id, timepoint) |>
pivot_longer(cols = c(GAD65, ZnT8, IA_2, IAA),
names_to = "antibody",
values_to = "expression") |>
pivot_longer(starts_with('HLA_'),
names_to = "allele",
values_to = "subtype") |>
mutate(allele = str_remove_all(string = allele,
pattern = "(\\,\\d)")) |>
write_tsv("./../data/03_dat_aug.tsv")
04_describe
Load Data
#|message: false
data <- read_tsv("./../data/03_dat_aug.tsv",
show_col_types = FALSE)Data Description
This dataset comprises a comprehensive registry detailing immunological and demographic characteristics, encompassing 216 observations across 41 variables. It captures data from 54 (29 cases and 25 control) patients across four distinct visits, including demographic details like gender, ethnicity, race, and age at diagnosis and visit. It encompasses key immunological markers such as GAD65, IA_2, IAA, ZnT8 antibodies, along with extensive HLA allele information (HLA_A, HLA_B, HLA_C, HLA_DPA1, HLA_DPB1, HLA_DQA1, HLA_DQB1, HLA_DRB1). Additionally, it contains metrics (e.g., total_templates, total_rearrangements, productive_templates) about the T-cell receptor β region, crucial for diversifying T-cell receptors and recognizing antigens. This dataset elucidates the intricate relationship between demographic factors and immunological profiles across multiple patient visits, providing invaluable insights for in-depth analysis and comprehension. Refer to the complete list of variables provided below.
Variables
data |>
tbl_vars()<dplyr:::vars>
[1] "sample_name" "total_templates"
[3] "productive_templates" "fraction_productive"
[5] "total_rearrangements" "productive_rearrangements"
[7] "productive_simpson_clonality" "max_productive_frequency"
[9] "locus" "sku"
[11] "test_name" "case_control"
[13] "id" "timepoint"
[15] "gender" "ethnicity"
[17] "race" "age"
[19] "age_diagnosis" "age_visit"
[21] "antibody" "expression"
[23] "allele" "subtype"
Age
data |>
drop_na(gender, age, case_control) |>
select(gender, age, case_control, id, timepoint) |>
distinct() |>
mutate(age = case_when(age < 5 ~ factor("(0,5]"),
age < 10 ~ factor("(5,10]"),
age < 15 ~ factor("(10,15]"),
age < 20 ~ factor("(15,20]"),
age < 25 ~ factor("(20,25]"))) |>
filter(!is.na(age)) |>
ggplot(mapping = aes(x = age,
fill = case_control)) +
geom_bar(alpha = 0.8,
position = position_dodge(preserve = "single"),
color = "black") +
theme_bw() +
labs(title = "Age group stratified by case/control per timepoint",
x = "Age",
y = "Count",
fill = "Case/Control") +
scale_fill_manual(values = c(Case = "#481567",
Control = "#20A387")) +
facet_wrap(~timepoint) +
theme(legend.position = "bottom")ggsave(width = 8,
height = 3,
filename = "04_plot_1.png",
path = "./../results/")
data |>
drop_na(age_diagnosis) |>
ggplot(data = _,
mapping = aes(x = age_diagnosis)) +
geom_boxplot(color = "#481567",
fill = "#20A387",
alpha = 0.8,
size = 0.5) +
theme_bw() +
theme(axis.text.y = element_blank(),
axis.ticks.y = element_blank()) +
labs(title = "Age at diagnosis",
x = '')ggsave(width = 8,
height = 3,
filename = "04_plot_2.png",
path = "./../results/")The age distribution of patients predominantly falls within the 0-5 age group. Notably, more than half were diagnosed between ages 10 and 15. The participants underwent testing from early childhood into their twenties. Among the 29 individuals, only 5 received diagnoses after age 15, while 24 were diagnosed before reaching that age. Given the data-set is limited size of 29 observations, the analysis results might diverge from the actual pattern.
TCR-Beta diversity
Total rearrangement
data |>
group_by(case_control, age_visit, timepoint) |>
summarize(total_temp_mu = log(mean(total_rearrangements))) |>
ggplot(data = _,
mapping = aes(x = age_visit,
y = total_temp_mu)) +
geom_point(mapping = aes(color = timepoint)) +
labs(title = "Mean of Total Rearrangements vs. Age at visit",
subtitle = "Faceted by outcome and stratified by timepoint",
x = "Age",
y = "log( Total Rearrangement )",
color = "Timepoint") +
scale_color_viridis() +
geom_smooth(method = "lm",
color = "black") +
facet_grid(vars(case_control)) +
theme_bw()ggsave(width = 8,
height = 6,
filename = "04_plot_3.png",
path = "./../results/")Productive rearrangement
data |>
group_by(case_control, age_visit, timepoint) |>
summarize(prod_temp_mu = log(mean(productive_rearrangements))) |>
ggplot(data = _,
mapping = aes(x = age_visit,
y = prod_temp_mu)) +
geom_point(mapping = aes(color = timepoint)) +
labs(title = "Mean of Productive Rearrangements vs. Age at visit",
subtitle = "Faceted by outcome and stratified by timepoint",
x = "Age",
y = "log( Productive Rearrangement )",
color = "Timepoint") +
scale_color_viridis() +
geom_smooth(method = "lm",
color = "black") +
facet_grid(vars(case_control)) +
theme_bw()ggsave(width = 8,
height = 6,
filename = "04_plot_4.png",
path = "./../results/")We see a slight rise in total and productive rearrangements for case patients and a decrease for control patients, when comparing the values against age at visit. So the number of rearrangements seems to be stable in case patients during aging, however there is decreasing rearrangement for the control patients.
Fraction productive rearrangement
data |>
group_by(case_control,
age_visit,
timepoint) |>
summarize(frac_prod = mean(productive_rearrangements) / mean(total_rearrangements)) |>
ggplot(data = _,
mapping = aes(x = age_visit,
y = frac_prod)) +
geom_point(mapping = aes(color = timepoint)) +
labs(title = "Fraction of Productive Rearrangements vs. Age at visit",
subtitle = "Faceted by outcome and stratified by timepoint",
x = "Age",
y = "Fraction of Productive Rearrangements",
color = "Timepoint") +
scale_color_viridis() +
geom_smooth(method = "lm",
color = "black") +
facet_grid(vars(case_control)) +
theme_bw()ggsave(width = 8,
height = 6,
filename = "04_plot_5.png",
path = "./../results/")We see a steep rise in the fraction of productive rearrangement with age (regardless of outcome for patients), which must mean that the differences in values between productive rearrangement and total rearrangement become smaller with age.
Productive Simpson Clonality
data |>
ggplot(data = _,
mapping = aes(x = age_visit,
y = log(productive_simpson_clonality))) +
geom_point(mapping = aes(color = timepoint)) +
labs(title = "Productive Simpson Clonality vs. Age",
subtitle = "Faceted by outcome and stratified by timepoint",
x = "Age",
y = "log( Productive Simpson Clonality )") +
scale_color_viridis() +
geom_smooth(method = "lm",
color = "black") +
facet_grid(vars(case_control)) +
theme_bw()ggsave(width = 8,
height = 6,
filename = "04_plot_6.png",
path = "./../results/")Both case and control show a direct proportionality between productive simpson clonality and age, which shows the decrease of polyclonality with age.
05_antibody_expression_analysis
Load Data
data <- read_tsv("./../data/03_dat_aug.tsv",
show_col_types = FALSE)Data Analysis
data |>
group_by(antibody, timepoint, case_control) |>
summarise(mean_expression = mean(expression,
na.rm = TRUE),
.groups = "drop") |>
ggplot(mapping = aes(x = factor(timepoint),
y = mean_expression,
fill = case_control)) +
geom_bar(stat = "identity",
position = "dodge",
alpha = 0.8,
color = 'black') +
facet_wrap(vars(antibody),
scales = "free_y",
nrow = 2) +
labs(title = "Antibody expression across visits stratified by case and control",
x = "Timepoint",
y = "Mean Expression",
fill = "Case or Control") +
scale_fill_manual(values = c(Case = "#481567",
Control = "#20A387")) +
theme_bw() +
theme(legend.position = "bottom")ggsave(width = 8,
height = 6,
filename = "05_plot_1.png",
path = "./../results/")
06_pca_antibody
PCA handling and plotting
conversion_table <- tibble(
case_control = c("Case", "Control"),
num_case_control = c(1, 0)
)
data <- read_tsv("./../data/03_dat_aug.tsv",
show_col_types = FALSE) |>
pivot_wider(names_from = "antibody",
values_from = "expression") |>
select(case_control, GAD65, IA_2, IAA, ZnT8) |>
mutate(across(starts_with("GAD65"),
~ if_else(is.na(.), 0.001, .)),
across(starts_with("IA_2"),
~ if_else(is.na(.), 0.001, .)),
across(starts_with("IAA"),
~ if_else(is.na(.), 0.001, .)),
across(starts_with("ZnT8"),
~ if_else(is.na(.), 0.001, .))) |>
left_join(conversion_table,
by = "case_control")
data_pca <- data |>
select(-case_control,
-num_case_control)|>
scale() |>
prcomp()
data_pca |>
augment(data) |>
ggplot(data = _,
mapping = aes(x = .fittedPC1,
y = .fittedPC2,
color = case_control)) +
geom_point(size = 1.5) +
scale_color_manual(values = c(Case = "#481567",
Control = "#20A387")) +
theme_bw() +
labs(title = "Plot of individuals",
color = "Case vs Control")ggsave(width = 8,
height = 5,
filename = "06_plot_1.png",
path = "./../results/")
data_pca |>
tidy(matrix = "eigenvalues") |>
ggplot(mapping = aes(x = PC,
y = percent)) +
geom_col(fill = "#20A387",
alpha = 0.8) +
scale_x_continuous(breaks = 1:9) +
scale_y_continuous(labels = scales::percent_format(),
expand = expansion(mult = c(0, 0.01))) +
geom_line(mapping = aes(x = PC),
color = "#481567") +
theme_bw() +
labs(title = "Explained Variance by Principal Components",
y = "Percentage Contribution")ggsave(width = 8,
height = 5,
filename = "06_plot_2.png",
path = "./../results/")
arrow_style <- arrow(length = unit(0.2, "cm"),
ends = "first")
num_case_control_coord <- cor(data$num_case_control, data_pca$x) |>
as_tibble()
data_pca |>
tidy(matrix = "rotation") |>
pivot_wider(names_from = "PC",
names_prefix = "PC",
values_from = "value") |>
ggplot(mapping = aes(x = PC2,
y = PC1)) +
geom_point(alpha = 0) +
coord_flip() +
theme_bw() +
labs(title = "Rotation Matrix",
subtitle = "Exploring Variability Along Principal Components") +
geom_segment(mapping = aes(xend = 0,
yend = 0),
arrow = arrow_style,
color = "#481567") +
geom_segment(data = num_case_control_coord,
mapping = aes(xend = 0,
yend = 0),
arrow = arrow_style,
color = "#20A387") +
geom_label_repel(data = num_case_control_coord,
mapping = aes(label = "case_control"),
color = "#20A387",
size = 3.5) +
geom_label_repel(mapping = aes(label = column),
color = "#481567",
size = 3.5)ggsave(width = 8,
height = 5,
filename = "06_plot_3.png",
path = "./../results/")
07_HLA_allele_analysis
data <- read_tsv("./../data/03_dat_aug.tsv",
show_col_types = FALSE)
HLA_tidy <- data |>
filter(timepoint == 4) |>
mutate(allele = str_remove(string = allele,
pattern = "(\\,1)|(\\,2)")) |>
mutate(allele_subtype = str_c(allele,
subtype,
sep = ":")) |>
drop_na(allele_subtype) |>
mutate(subtype_n = 1) |>
select(case_control, id, allele_subtype, subtype_n) |>
group_by(case_control, allele_subtype) |>
summarize(allele_n = n(), .groups = "drop") |>
pivot_wider(names_from = case_control,
values_from = allele_n) |>
na.omit(Control, Case) |>
mutate(ratio = round(Case/Control,
digits = 2)) |>
select(allele_subtype, ratio)
ggplot(data = HLA_tidy,
mapping = aes(x = reorder(allele_subtype, +ratio),
y = ratio)) +
geom_bar(stat = "identity",
color = "#20A387",
fill = "#20A387",
alpha = 0.8) +
coord_flip() +
labs(title = "Ratio of HLA subtype occurence for case and control patients",
x = "Allele subtype",
y = "Ratio (Case/Control)") +
theme_bw() +
theme(panel.grid.major.y = element_blank(),
axis.text.x= element_text(size=10),
axis.text.y= element_text(size=7))ggsave(width = 10,
height = 10,
filename = "07_plot_2.png",
path = "./../results/")data <- read_tsv("./../data/03_dat_aug.tsv",
show_col_types = FALSE)
HLA_tidy <-
HLA_tidy |>
ungroup() |>
mutate(allele = str_extract(string = allele_subtype,
pattern = "HLA[^\\:]+"),
subtype = str_extract(string = allele_subtype,
pattern = "\\d{4}"))
ggplot(data = HLA_tidy,
mapping = aes(x = subtype,
y = allele)) +
geom_tile(aes(fill = ratio),
color="#481567") +
theme_bw()+
theme(axis.text.y = element_text(size = 10),
axis.text.x = element_text(angle = 90,
size = 10,
vjust=0.5),
panel.grid.major.y = element_blank())+
scale_fill_gradient(low = "#20A387",
high = "#481567") +
labs(title = "Ratio of HLA subtypes in case and control patients",
x = "Subtype",
y = "Allele",
fill = "Case/Control")ggsave(width = 8,
height = 5,
filename = "07_plot_1.png",
path = "./../results/")
08_pca_HLA
Load Data
data <- read_tsv("./../data/03_dat_aug.tsv",
show_col_types = FALSE)Analysis
MCA HLA subtypes
Percentage of variance explained
HLA_tidy <- data |>
filter(timepoint == 1) |>
select(case_control, id, allele, subtype) |>
mutate(allele_subtype = str_c(allele,
subtype,
sep = ":")) |>
na.omit(allele_subtype) |>
group_by(case_control, id, allele_subtype) |>
summarize(subtype_n = n(),
.groups = "drop") |>
mutate(subtype_n = str_replace(string = subtype_n,
pattern = "4", "yes")) |>
pivot_wider(names_from = allele_subtype,
values_from = subtype_n) |>
mutate(across(everything(),
~replace_na(.x, "no"))) |>
mutate(across(starts_with("HLA"), as.factor)) |>
mutate(patient = str_c(case_control,
id,
sep = ":"))
relevant_var <- unname(as_vector( HLA_tidy |>
pivot_longer(cols = starts_with("HLA"),
names_to = "subtype",
values_to = "presence") |>
select(subtype,
presence) |>
filter(presence == "yes") |>
group_by(subtype) |>
summarize(freq = n()/54) |>
ungroup() |>
filter(freq >= 0.30) |>
select(subtype) ))
HLA.active <- HLA_tidy |>
select(all_of( relevant_var ))
mca_fit <- MCA(HLA.active,
graph = FALSE)
as_tibble_row(mca_fit) |>
select(eig) |>
unnest(eig) |>
reduce("*") |>
as_tibble() |>
rename("eigenvalue" = 1,
"var_percent" = 2,
"cum_var_percent" = 3) |>
mutate(Dim = str_c("Dim", seq(1,13)))|>
ggplot(aes(x = reorder(Dim, - var_percent),
y = var_percent)) +
geom_col(fill = "#481567",
alpha = 0.8) +
geom_text(aes(label = round(var_percent,1)),
size = 3,
vjust = -0.3) +
theme(axis.text.x = element_text(angle = 90),
panel.grid.minor = element_blank()) +
theme_bw() +
labs(title = "Variance Explained by Each Dimension",
x = "Dimension",
y = "Percentage of Variance explained")ggsave(width = 8,
height = 5,
filename = "08_plot_1.png",
path = "./../results/")Allele subtype category (present or not) representation
coordinates <- as_tibble_row(mca_fit) |>
select(var) |>
unnest_wider(var) |>
select(coord)|>
unnest(coord) |>
reduce("*") |>
as_tibble() |>
select(1,2) |>
rename("coord_Dim1" = 1,
"coord_Dim2" = 2) |>
mutate(id=seq(1,26))
cos2values <- as_tibble_row(mca_fit) |>
select(var) |>
unnest_wider(var) |>
select(cos2)|>
unnest(cos2) |>
reduce("*") |>
as_tibble() |>
select(1, 2) |>
rename("cos2_Dim1" = 1,
"cos2_Dim2" = 2) |>
mutate(id = seq(1,26))
full_join(coordinates,
cos2values,
by = join_by(id == id)) |>
mutate(HLA = rownames(data.frame( mca_fit$var )),
representation = cos2_Dim1 + cos2_Dim2) |>
ggplot(data = _,
mapping=aes(x = coord_Dim1,
y = coord_Dim2,
label = HLA)) +
geom_vline(xintercept = 0,
linetype = "dashed",
linewidth = 0.2) +
geom_hline(yintercept = 0,
linetype = "dashed",
linewidth = 0.2) +
geom_point(aes(color = representation)) +
geom_label_repel(aes(color = representation),
force_pull = 0,
size = 2,
nudge_y = 0.4,
direction = "y") +
scale_color_gradient2(low="#20A387",
mid="#f68f46ff",
high="#481567",
midpoint=0.4) +
theme_bw() +
labs(title = "Dimension 2 vs. Dimension 1",
subtitle = "Allele subtype Categories stratified by dimensional representation of category",
x = "Dim 1",
y = "Dim 2",
color = "Cos2")ggsave(width = 8,
height = 5,
filename = "08_plot_2.png",
path = "./../results/")Individuals in 2D.
as_tibble_row(mca_fit) |>
select(ind) |>
unnest_wider(ind) |>
select(coord)|>
unnest(coord) |>
reduce("*") |>
as_tibble() |>
select(1, 2) |>
rename("coord_Dim1" = 1,
"coord_Dim2" = 2) |>
mutate(case_control = HLA_tidy |> pull(case_control),
patient = HLA_tidy |> pull(patient)) |>
ggplot(data = _,
mapping = aes(x = coord_Dim1,
y = coord_Dim2,
label = patient)) +
geom_vline(xintercept = 0,
linetype = "dashed",
linewidth = 0.2) +
geom_hline(yintercept = 0,
linetype = "dashed",
linewidth = 0.2) +
geom_label_repel(mapping = aes(color = case_control),
size = 2,
max.overlaps = 15) +
scale_color_manual(values = c(Case = "#481567",
Control = "#20A387")) +
theme_bw() +
labs(title = "Dimension 2 vs. Dimension 1",
subtitle = "Patients stratified by outcome",
x = "Dim 1",
y = "Dim 2",
color = "Case/Control")ggsave(width = 8,
height = 5,
filename = "08_plot_3.png",
path = "./../results/")None of the dimensions explain enough of the variance themselves. The allele subtype category representation of dimension 1 and 2 show promise of investigative allele subtypes, however since the variance explained by the dimensions is low, no conclusion can be drawn. The patient representation gives less evidence of sufficient variance explained by dimension 1 and 2. A four dimensional explanation of variance could be investigated, as the first 4 dimensions explain a little over 70% of the variance.